Data

The data comes from the Elements of Statistical Learning textbook and is originally from a study by Stamey et al. (1989), in which they examined the relationship between the level of prostate-specific antigen (psa) and a number of clinical measures in men who were about to receive a prostatectomy. The variables are as follows:

These data are available in prostate.csv. Let’s start by reading in the data and some basic EDA.

# Read in the prostate data (path is relative to the directory the document
# is rendered from; ".././data" resolves to "../data")
prostate = read.csv(".././data/prostate.csv")
# Inspect the first few rows to check the columns read in as expected
head(prostate)
##       lcavol  lweight age      lbph svi       lcp gleason pgg45       lpsa
## 1 -0.5798185 2.769459  50 -1.386294   0 -1.386294       6     0 -0.4307829
## 2 -0.9942523 3.319626  58 -1.386294   0 -1.386294       6     0 -0.1625189
## 3 -0.5108256 2.691243  74 -1.386294   0 -1.386294       7    20 -0.1625189
## 4 -1.2039728 3.282789  58 -1.386294   0 -1.386294       6     0 -0.1625189
## 5  0.7514161 3.432373  62 -1.386294   0 -1.386294       6     0  0.3715636
## 6 -1.0498221 3.228826  50 -1.386294   0 -1.386294       6     0  0.7654678
##   train
## 1  TRUE
## 2  TRUE
## 3  TRUE
## 4  TRUE
## 5  TRUE
## 6  TRUE
# Numeric summaries of every column; the printed output shows no NA counts,
# confirming there is no missingness
summary(prostate)
##      lcavol           lweight           age             lbph        
##  Min.   :-1.3471   Min.   :2.375   Min.   :41.00   Min.   :-1.3863  
##  1st Qu.: 0.5128   1st Qu.:3.376   1st Qu.:60.00   1st Qu.:-1.3863  
##  Median : 1.4469   Median :3.623   Median :65.00   Median : 0.3001  
##  Mean   : 1.3500   Mean   :3.629   Mean   :63.87   Mean   : 0.1004  
##  3rd Qu.: 2.1270   3rd Qu.:3.876   3rd Qu.:68.00   3rd Qu.: 1.5581  
##  Max.   : 3.8210   Max.   :4.780   Max.   :79.00   Max.   : 2.3263  
##       svi              lcp             gleason          pgg45       
##  Min.   :0.0000   Min.   :-1.3863   Min.   :6.000   Min.   :  0.00  
##  1st Qu.:0.0000   1st Qu.:-1.3863   1st Qu.:6.000   1st Qu.:  0.00  
##  Median :0.0000   Median :-0.7985   Median :7.000   Median : 15.00  
##  Mean   :0.2165   Mean   :-0.1794   Mean   :6.753   Mean   : 24.38  
##  3rd Qu.:0.0000   3rd Qu.: 1.1787   3rd Qu.:7.000   3rd Qu.: 40.00  
##  Max.   :1.0000   Max.   : 2.9042   Max.   :9.000   Max.   :100.00  
##       lpsa           train        
##  Min.   :-0.4308   Mode :logical  
##  1st Qu.: 1.7317   FALSE:30       
##  Median : 2.5915   TRUE :67       
##  Mean   : 2.4784                  
##  3rd Qu.: 3.0564                  
##  Max.   : 5.5829
# Visualize pairwise relationships (except the last column indicating the
# test/train split). Work on a copy so the original data frame is untouched.
prostate_gg = prostate
# Treat the binary svi indicator as a factor so ggpairs renders it categorically
prostate_gg$svi = as.factor(prostate_gg$svi)
# Use prostate_gg consistently (it has the same columns as prostate);
# ncol(prostate_gg) - 1 drops the trailing `train` column from the plot
ggpairs(prostate_gg, columns = 1:(ncol(prostate_gg) - 1))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

There are \(N=97\) observations with no missingness. We observe that:

We see that both lcavol and lcp seem to have a linear relationship with the response lpsa.

Split the data

# Split the data into train/test using the logical `train` column, then drop
# that column from both sets. (Subsetting a data frame already yields a data
# frame, so no extra data.frame() wrapper is needed; `train` is logical per
# the summary above, so it can be used directly as the row filter.)
train_df = select(prostate[prostate$train == 1, ], -train)
test_df = select(prostate[prostate$train == 0, ], -train)

N = nrow(train_df)      # number of training observations
D = ncol(train_df) - 1  # number of features (all columns except the response lpsa)
Nnew = nrow(test_df)    # number of test observations
print(paste("Number of data points =",N, " and number of features =", D))
## [1] "Number of data points = 67  and number of features = 8"

Fit linear regression

Let’s start by fitting a baseline linear regression model.

First, we will scale the inputs.

# Standardize every column (including the response lpsa — hence the ~0
# intercept in the lm fit below) using the TRAINING means and standard
# deviations. The test set is scaled with the training statistics to avoid
# information leakage. sweep() subtracts/divides column-wise without having
# to hand-build replicated matrices of the right dimensions.
train.m = apply(train_df, 2, mean)
train.s = apply(train_df, 2, sd)
train_df = sweep(sweep(train_df, 2, train.m, "-"), 2, train.s, "/")
test_df = sweep(sweep(test_df, 2, train.m, "-"), 2, train.s, "/")

Fit the linear regression model and visualize the coefficients.

# Fit lm: ordinary least squares with all predictors. Because the data were
# standardized above, the intercept is ~0 and coefficients are comparable in scale.
lm_prostate = lm(lpsa~., data=train_df)
summary(lm_prostate)
## 
## Call:
## lm(formula = lpsa ~ ., data = train_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.36503 -0.28272 -0.04491  0.37209  1.23095 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.200e-16  7.205e-02   0.000  1.00000    
## lcavol       5.931e-01  1.105e-01   5.366 1.47e-06 ***
## lweight      2.423e-01  8.808e-02   2.751  0.00792 ** 
## age         -1.180e-01  8.455e-02  -1.396  0.16806    
## lbph         1.755e-01  8.538e-02   2.056  0.04431 *  
## svi          2.563e-01  1.038e-01   2.469  0.01651 *  
## lcp         -2.393e-01  1.282e-01  -1.867  0.06697 .  
## gleason     -1.732e-02  1.180e-01  -0.147  0.88389    
## pgg45        2.296e-01  1.321e-01   1.738  0.08755 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5897 on 58 degrees of freedom
## Multiple R-squared:  0.6944, Adjusted R-squared:  0.6522 
## F-statistic: 16.47 on 8 and 58 DF,  p-value: 2.042e-12
# Tidy the lm fit into a data frame of estimates with 95% confidence intervals
tidy_reg_conf_int = tidy(lm_prostate,conf.int =TRUE)
tidy_reg_conf_int %>%
  # drop the intercept (it is ~0 after standardization and not of interest)
  filter(term != "(Intercept)") %>%
  # reorder the coefficients so that the largest is at the top of the plot
  mutate(term = fct_reorder(term, estimate)) %>%
  ggplot(aes(estimate, term)) +
  geom_point() +
  # horizontal error bars spanning the confidence interval
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  # add in a dotted line at zero
  geom_vline(xintercept = 0, lty = 2) +
  labs(
    x = "Estimate of effect of variable on lpsa",
    y = NULL,
    title = "Coefficient plot with error bars") +
  theme_bw()

Compute the RMSE on the test data

# Predict on the held-out test set and compute root mean squared error
yhat = predict(lm_prostate, test_df)
rmse_lm = sqrt(mean((yhat-test_df$lpsa)^2))
print(paste("The RMSE for the linear model is ",rmse_lm))
## [1] "The RMSE for the linear model is  0.597769431021197"

Lasso

Now, we will use glmnet to fit a lasso model. First, we need to use cross-validation to tune \(\lambda\).

# Seed for reproducibility of the cross-validation folds
set.seed (1)
# Design matrix and response for glmnet; the inputs were already standardized
# above, hence standardize = FALSE below
X = as.matrix(train_df[,1:D])
y = train_df$lpsa
# alpha = 1 selects the lasso penalty
cv.out=cv.glmnet(X, y, alpha=1, standardize = FALSE)
plot(cv.out)

# Lambda minimizing the cross-validated error
bestlam=cv.out$lambda.min
print(bestlam)
## [1] 0.006281436

Now, let’s refit with the best lambda

# Refit the lasso on the full training set at the CV-selected lambda
glmnet.out=glmnet(X, y, alpha=1, lambda = bestlam, standardize = FALSE)
# Sparse coefficient vector: gleason is shrunk exactly to zero (shown as ".")
glmnet.out$beta
## 8 x 1 sparse Matrix of class "dgCMatrix"
##                 s0
## lcavol   0.5728689
## lweight  0.2390965
## age     -0.1050754
## lbph     0.1683333
## svi      0.2434682
## lcp     -0.1981421
## gleason  .        
## pgg45    0.1953291

Add the lasso estimates to our plot:

# Re-tidy the OLS fit, then attach the lasso coefficients as an extra column.
tidy_reg_conf_int = tidy(lm_prostate,conf.int =TRUE)
# NOTE(review): this assumes predict(type="coefficients") returns rows in the
# same order as tidy() (intercept first, then the predictors) — confirm.
tidy_reg_conf_int$lasso = as.matrix(predict(glmnet.out,type="coefficients",s=bestlam))
tidy_reg_conf_int %>%
  filter(term != "(Intercept)") %>%
  # reorder the coefficients so that the largest is at the top of the plot
  mutate(term = fct_reorder(term, estimate)) %>%
  ggplot(aes(estimate, term)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  # add lasso estimates (blue triangles) alongside the OLS points
  geom_point(aes(lasso, term),shape = 2, color = 'blue') +
  # add in a dotted line at zero
  geom_vline(xintercept = 0, lty = 2) +
  labs(
    x = "Estimate of effect of variable on lpsa",
    y = NULL,
    title = "Coefficient plot with error bars") +
  theme_bw()

Compute the RMSE on the test data

# Test design matrix (feature columns only), then predict at the chosen lambda
Xtest = as.matrix(test_df[,1:D])
yhat = predict(glmnet.out, s = bestlam, Xtest)
rmse_lasso = sqrt(mean((yhat-test_df$lpsa)^2))
print(paste("The RMSE for the lasso is ",rmse_lasso))
## [1] "The RMSE for the lasso is  0.587015428770515"

Ridge and Elastic Net

Exercise: Try to repeat for ridge and elastic net and compare the coefficients and RMSE.

Bayesian Lasso

Run a simple Gibbs sampling algorithm.

# Define prior parameters
# NOTE(review): exact roles depend on gibbs_blasso(), which is defined
# elsewhere — presumably s scales the lasso-type prior on the coefficients,
# a_sigma/b_sigma parameterize the noise-variance prior, and s2_0 is the
# prior variance for the intercept; confirm against its definition.
priors = list(s=1, a_sigma = 2, b_sigma = 1, s2_0 = 10)

I = 2000 #number of iterations

# Initialize at the least-squares solution (good starting values, so the
# chain starts near the posterior mode)
w_init = lm_prostate$coefficients
sigma_init = mean(lm_prostate$residuals^2)

#Run MCMC
blasso_out = gibbs_blasso(X,y, priors, I, w_init, sigma_init)
## [1] "Number of iterations completed: 100"
## [1] "Number of iterations completed: 200"
## [1] "Number of iterations completed: 300"
## [1] "Number of iterations completed: 400"
## [1] "Number of iterations completed: 500"
## [1] "Number of iterations completed: 600"
## [1] "Number of iterations completed: 700"
## [1] "Number of iterations completed: 800"
## [1] "Number of iterations completed: 900"
## [1] "Number of iterations completed: 1000"
## [1] "Number of iterations completed: 1100"
## [1] "Number of iterations completed: 1200"
## [1] "Number of iterations completed: 1300"
## [1] "Number of iterations completed: 1400"
## [1] "Number of iterations completed: 1500"
## [1] "Number of iterations completed: 1600"
## [1] "Number of iterations completed: 1700"
## [1] "Number of iterations completed: 1800"
## [1] "Number of iterations completed: 1900"
## [1] "Number of iterations completed: 2000"

Check mixing and determine burnin (note here we started with good initial values!)

# Traceplots of the sampled coefficients, tau and sigma to assess mixing
traceplot(as.mcmc(blasso_out$w))

traceplot(as.mcmc(blasso_out$tau))

traceplot(as.mcmc(blasso_out$sigma))

# Check other diagnostics in coda

# Remove burnin
# NOTE(review): b:I keeps iteration b itself, retaining I - b + 1 draws;
# (b+1):I would discard exactly b burn-in draws — confirm which was intended.
b = 1000
blasso_out$w = blasso_out$w[b:I,]
blasso_out$tau = blasso_out$tau[b:I,]
blasso_out$sigma = blasso_out$sigma[b:I]

Let’s compare the coefficients

# Compute posterior mean and HPD intervals
# Posterior mean of each coefficient (columns of the post-burnin draws)
bl_what = apply(blasso_out$w,2,mean)
# HPD interval per coefficient; apply() stacks the results into a matrix
# with lower bounds in row 1 and upper bounds in row 2
bl_wci = apply(blasso_out$w,2,function(x){HPDinterval(as.mcmc(x))})
  
# Stash the least-squares estimates under `ls`, then overwrite the estimate
# and interval columns with the Bayesian lasso posterior summaries so the
# same plotting code can be reused for all three sets of estimates
tidy_reg_conf_int$ls = tidy_reg_conf_int$estimate 
tidy_reg_conf_int$estimate = bl_what
tidy_reg_conf_int$conf.low = bl_wci[1,]
tidy_reg_conf_int$conf.high = bl_wci[2,]

tidy_reg_conf_int %>%
  filter(term != "(Intercept)") %>%
  # reorder the coefficients so that the largest is at the top of the plot
  mutate(term = fct_reorder(term, estimate)) %>%
  ggplot(aes(estimate, term)) +
  geom_point() +
  # error bars now show the Bayesian HPD intervals
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  # add lasso estimates (blue triangles)
  geom_point(aes(lasso, term),shape = 2, color = 'blue') +
  # add least-squares estimates (red crosses)
  geom_point(aes(ls, term),shape = 4, color = 'red') +
  # add in a dotted line at zero
  geom_vline(xintercept = 0, lty = 2) +
  labs(
    x = "Estimate of effect of variable on lpsa",
    y = NULL,
    title = "Coefficient plot with error bars") +
  theme_bw()

Compute the RMSE:

# Posterior-mean prediction: prepend a column of ones since bl_what includes
# the intercept as its first element
yhat = cbind(rep(1,Nnew),Xtest)%*%bl_what
rmse_blasso = sqrt(mean((yhat-test_df$lpsa)^2))
print(paste("The RMSE for the bayesian lasso is ",rmse_blasso))
## [1] "The RMSE for the bayesian lasso is  0.588250333489908"

Exercises

  1. Try changing s and seeing how it affects the results. What is a reasonable value based on the estimated \(\lambda\) from lasso?
  2. Try changing the initialization to a random draw from the prior. How does this affect the results?
  3. Try adding a step to sample \(s\).

Bayesreg

You can also try to compare with the package bayesreg, which implements a number of shrinkage priors.

library(bayesreg)
# Fit the Bayesian lasso with bayesreg's own sampler, drawing 2000 samples
fit <- bayesreg(lpsa~., train_df, model = "gaussian", prior = "lasso", n.samples = 2000)

How do the results compare?

# Traceplots for the bayesreg fit. (The original header comment "Define prior
# parameters" was a copy-paste leftover.) bayesreg stores samples one column
# per draw, so transpose before converting to mcmc objects.
traceplot(as.mcmc(t(fit$beta)))

traceplot(as.mcmc(t(fit$beta0)))

traceplot(as.mcmc(t(fit$sigma2)))

traceplot(as.mcmc(t(fit$tau2)))

# Check other diagnostics in coda

# Remove burnin — use the fit's own number of draws rather than the global I
# from the earlier Gibbs chunk, so this still works if n.samples is changed
b = 1000
n_samples = ncol(fit$beta)
fit$beta = fit$beta[, b:n_samples]
fit$beta0 = fit$beta0[b:n_samples]
fit$sigma2 = fit$sigma2[b:n_samples]
fit$tau2 = fit$tau2[b:n_samples]

# Compare scatter plots of samples from the two MCMC algorithms.
# blasso_out$w has the intercept as its first column (hence ind + 1), while
# fit$beta stores predictor coefficients only, one row per predictor.
coeff_names = names(train_df)[1:D]
ind1 = 1  # lcavol
ind2 = 6  # lcp
df = data.frame(w1 = c(blasso_out$w[, ind1 + 1], fit$beta[ind1, ]),
                w2 = c(blasso_out$w[, ind2 + 1], fit$beta[ind2, ]),
                mcmc = c(rep("Gibbs", nrow(blasso_out$w)), rep("HMC", ncol(fit$beta))))
ggplot(df) +
  geom_point(aes(x = w1, y = w2, color = mcmc)) +
  theme_bw() +
  labs(x = coeff_names[ind1], y = coeff_names[ind2])